InĀ [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
InĀ [2]:
# Load the dataset; the relative path means the CSV must sit in the kernel's working directory.
data = pd.read_csv("StudentsPerformance.csv")

Global dataset visualizations¶

  • We can see that girls and boys are almost equally represented.
  • Then for ethnicity, the majority is group C and the minority is group A
  • The least represented parental education level is master's degree, then bachelor's degree
  • A small majority have standard lunch whereas 35.5% have free or reduced lunch
  • The same goes for test preparation: a small majority of the students did not complete the preparation course, whereas 35.8% of the students completed it.
InĀ [3]:
# 2 x 3 grid in which every cell is a 'domain' subplot, ready to host the pie traces below.
fig = make_subplots(
    rows=2,
    cols=3,
    specs=[[{'type': 'domain'} for _ in range(3)] for _ in range(2)],
)


def create_pie_chart(fig, labels, title, row, col, colors):
    """Add one styled pie chart to a single subplot cell of ``fig``.

    Parameters
    ----------
    fig : plotly figure
        Figure that receives the trace (mutated in place).
    labels : sequence
        One label per observation. No ``values`` are supplied, so Plotly
        sizes each slice by how often its label occurs.
    title : str
        Text displayed with the pie.
    row, col : int
        1-based subplot cell that receives the trace.
    colors : sequence of str
        Slice colours.

    Returns
    -------
    None
    """
    fig.add_trace(
        go.Pie(
            labels=labels,
            # `titlefont` is a deprecated alias that newer Plotly releases
            # reject; the nested title.font form works on old and new versions.
            title=dict(text=title, font=dict(size=24)),
            # Style this trace directly instead of calling
            # fig.update_traces(), which has no selector and would restyle
            # every trace already present in the figure on each call.
            hoverinfo='label+value',
            textinfo='label+percent',
            textfont=dict(size=10),
            marker=dict(
                colors=colors,
                line=dict(color='#000000', width=2),
            ),
        ),
        row=row, col=col,
    )

# One pie per categorical column; the sixth cell of the 2x3 grid stays empty.
colors = px.colors.sequential.Viridis
create_pie_chart(fig, data['gender'], 'Gender', 1, 1, colors)
create_pie_chart(fig, data['race/ethnicity'], 'Race', 1, 2, colors)
create_pie_chart(fig, data['parental level of education'], 'ParentEduc.', 1, 3, colors)
create_pie_chart(fig, data['lunch'], 'Lunch', 2, 1, colors)
create_pie_chart(fig, data['test preparation course'], 'TestPrep.', 2, 2, colors)

fig.update_layout(
    # title.font replaces the deprecated `titlefont` alias (rejected by newer
    # Plotly releases); the bold tag is now properly closed.
    title=dict(text="<b>Multivariate analysis</b>", font=dict(size=24)),
    showlegend=False,
    height=700,
    width=1100,
    template='plotly_dark',
)
fig.show()
InĀ [4]:
# Hierarchy: race/ethnicity (inner ring) -> gender -> parental level of education (outer ring).
fig = px.sunburst(data, path=['race/ethnicity', 'gender', 'parental level of education'])
fig.update_layout(
    # title.font replaces the deprecated `titlefont` alias; the bold tag is
    # closed and "genre" is corrected to "gender", the column actually shown.
    title=dict(
        text="<b>Parental level of education in comparison with race and gender</b>",
        font=dict(size=24),
    ),
    width=800,
    height=700,
    template='plotly_dark',
)
fig.show()
InĀ [5]:
total_students = len(data)
colors = px.colors.sequential.Viridis
plots = []
# Desired left-to-right order of the categories on each chart's x axis.
categories = {
    'parental level of education': ["some high school", "high school", "associate's degree", "some college", "bachelor's degree", "master's degree"],
    'race/ethnicity': ["group A", "group B", "group C", "group D", "group E"],
    'lunch': ["standard", "free/reduced"],
    'test preparation course': ["none", "completed"]
}

for feature, cats in categories.items():
    # Head-count per (category, gender), reshaped to one column per gender.
    source = (
        data.groupby([feature, 'gender']).size().unstack()
        .reset_index()
        .rename(columns={'male': 'Male', 'female': 'Female'})
        .fillna(0)  # a combination with no students counts as zero
    )

    # Convert the counts to a percentage of ALL students (not of each category).
    source['Male'] = (source['Male'] / total_students) * 100
    source['Female'] = (source['Female'] / total_students) * 100

    fig = px.bar(source, x=feature, y=['Male', 'Female'], barmode='group', color_discrete_sequence=colors,
                labels={'value': '% of Students', 'variable': 'Gender'},
                title=f"{feature.capitalize()}",
                height=400,
                # `cats` was previously declared but never used, so the bars
                # appeared in alphabetical group-by order instead of this one.
                category_orders={feature: cats},
                template='plotly_dark')

    # The template is already set by px.bar above; only the legend needs adjusting.
    fig.update_layout(legend=dict(orientation="v", y=1, yanchor="top", x=1.0, xanchor="right"))

    plots.append(fig)

for plot in plots:
    plot.show()

Score visualizations¶

InĀ [6]:
# One horizontal half-violin per subject, drawn on a shared score axis.
fig = go.Figure()
fig.add_trace(go.Violin(x=data['math score'], line_color='salmon', name='Math'))
fig.add_trace(go.Violin(x=data['reading score'], line_color='gold', name='Reading'))
fig.add_trace(go.Violin(x=data['writing score'], line_color='lightseagreen', name='Writing'))

fig.update_traces(orientation='h', side='positive', width=3, points=False, meanline_visible=True)

fig.update_layout(
    title=dict(
        # The bold tag is now closed.
        text='<b> Test Score Comparison with stats (median quartile min max) </b>',
        # title.font replaces the deprecated `titlefont` alias. 'San Serif' is
        # not a real font family; the generic CSS family is 'sans-serif'.
        font=dict(size=24, family='sans-serif'),
    ),
    xaxis_showgrid=False,
    xaxis_zeroline=False,
    xaxis_title='Test scores',
    width=750,
    showlegend=False,
    template='plotly_dark',
)
fig.show()
InĀ [7]:
def create_scatter_plot(data, gender_colors=None):
    """Build a math-score vs. writing-score scatter figure, one trace per gender.

    Parameters
    ----------
    data : DataFrame-like
        Must provide 'gender', 'math score' and 'writing score' columns.
    gender_colors : dict, optional
        Maps each 'gender' value to plot onto its marker colour. When omitted,
        'male' and 'female' are drawn in cornflowerblue and darkorange.

    Returns
    -------
    The assembled plotly figure (not shown).
    """
    palette = gender_colors
    if palette is None:
        palette = {'male': 'cornflowerblue', 'female': 'darkorange'}

    fig = make_subplots(
        rows=1,
        cols=1,
        subplot_titles=['Math and writing Score'],
        specs=[[{'type': 'scatter'}]],
    )

    for gender, color in palette.items():
        label = gender.capitalize()
        subset = data[data['gender'] == gender]  # rows for this gender only
        fig.add_trace(
            go.Scatter(
                x=subset['math score'],
                y=subset['writing score'],
                name=label,
                text=label,
                showlegend=True,
                mode='markers',
                marker=dict(color=color, size=9, opacity=0.55),
            )
        )

    fig.update_layout(
        go.Layout(
            title='Math Score & Writing Score',
            xaxis=dict(title='Math Score'),
            yaxis=dict(title='Writing Score'),
            width=700,
            height=450,
            template='plotly_dark',
        )
    )
    return fig

# Override the default palette, then render the scatter figure.
gender_colors = dict(male='purple', female='green')
fig = create_scatter_plot(data, gender_colors=gender_colors)
fig.show()
InĀ [8]:
# One horizontal box per subject; the trace name labels each box.
data_bp = [
    go.Box(x=data[column], showlegend=False, name=label)
    for column, label in (
        ('reading score', 'Reading Score'),
        ('writing score', 'Writing Score'),
        ('math score', 'Math Score'),
    )
]

layout_bp = go.Layout(
    title={
        'text': "Boxplot of math, writing and reading scores",  # was misspelled "Bopxplot"
        'y': 0.9,
        'x': 0.5,  # centre the title horizontally
        'xanchor': 'center',
        'yanchor': 'top',
    },
    width=700,
    height=450,
    template='plotly_dark',
)

fig = go.Figure(data=data_bp, layout=layout_bp)
fig.show()

Other comparisons¶

InĀ [9]:
# Aggregate first: passing row-level x/y/z with repeated (gender, education)
# pairs makes each cell display a single arbitrary student's score. The cell
# colour should instead summarise the group, so use the mean math score.
math_by_group = data.pivot_table(
    index='parental level of education',
    columns='gender',
    values='math score',
    aggfunc='mean',
)

data_heatmap = [go.Heatmap(x=math_by_group.columns.tolist(),
              y=math_by_group.index.tolist(),
              z=math_by_group.values.tolist(),
              colorbar=dict(title=dict(text='Mean math score')),
              colorscale='Magma')]

layout_heatmap = go.Layout(title={'text': "Gender & Level of Education",
                     'y':0.9,
                     'x':0.5,
                     'xanchor': 'center',
                     'yanchor': 'top'},
              xaxis=dict(title='Gender'),
              yaxis=dict(title='Level of Education'),
              width=600,
              height=450,
              template='plotly_dark')

fig = go.Figure(data=data_heatmap, layout=layout_heatmap)
fig.show()
InĀ [10]:
# Seaborn pair plot of `data`, with points coloured by the 'gender' column.
sns.pairplot(data=data, hue='gender')
plt.show()
No description has been provided for this image
InĀ [11]:
# Mean math / reading / writing score per ethnicity group, as grouped bars.
px.bar(
    data_frame=data.groupby('race/ethnicity')[['math score', 'reading score', 'writing score']].mean(),
    barmode='group',
    title="<b>Ethnicity Analysis of scores</b>",
    template='plotly_dark',
)
InĀ [12]:
# Math-score distribution for each gender, drawn on shared axes.
fig = ff.create_distplot(
    [data[data['gender'] == gender]['math score'] for gender in ('male', 'female')],
    ['male', 'female'],
)
fig.update_layout(
    title={'text': 'Distribution of Math Scores by Gender', 'x': 0.5},
    width=600,
    height=450,
    template='plotly_dark',
)
fig.show()